Descriptive Statistics of Shared vs Solo Rides in Chicago in Nov 2019

Initial Preparation: (not important to the reader)

packages <- c("tidyverse", "tidycensus", "tigris", "sp", "scales", "reshape",
              "ggspatial", "lubridate", "cowplot", "viridis", "grid")
invisible(lapply(packages, suppressPackageStartupMessages(require), character.only=TRUE, quietly=TRUE,
                 warn.conflicts = FALSE)) 
rm(packages)
setwd("/Users/taiebat/Desktop/Chicago-TNC-analysis/")
load("/Users/taiebat/Desktop/TNC_Trips_data_clean.RData")
# Change variable names
names <- c( "trip_id" ,  "trip_seconds" ,  "trip_miles", 
            "pickup_census_tract", "dropoff_census_tract" , 
            "pickup_community_area", "dropoff_community_area", "fare" ,                 
            "tip" ,  "additional_charges" , "trip_total" ,           
            "shared_trip_authorized", "trips_pooled",
            "pickup_centroid_latitude", "pickup_centroid_longitude", "pickup_centroid_location",
            "dropoff_centroid_latitude", "dropoff_centroid_longitude", "dropoff_centroid_location",
            "start", "end", "start_date", "start_time", "start_day", 
            "end_date", "end_time", "speed", "pooled")
colnames(data) <- names; data["trip_id"] <-NULL; rm(names)

# Use the following function to infer the missing pickup/dropoff census tracts. 
source("~/Desktop/Chicago-TNC-analysis/complete_missing_tracts.R")
data <- complete_missing_tracts(data)

# This function adds two columns to the data "start_neighborhood" and "end_neighborhood"
# number of neighborhoods = 10
# It also prints the pick up and drop off neighborhoods by count of trips
source("~/Desktop/Chicago-TNC-analysis/community_name.R")
data <- neighborhood(data)

# This function calculates the pre-policy and post-policy tax and tax change based 
# on neighborhood and time of the day (surcharge).
# List of changes in tax policy:
# Pre-policy: $0.72
# Post-policy:
#   central + solo + surcharge (6:21): $3
#   central + shared : $1.25
#   citywide + solo: $1.25
#   citywide + shared: $0.65
# No change in airport pickup/dropoff tax ($5)
source("~/Desktop/Chicago-TNC-analysis/community_name.R")
data <- calculate_tax(data)


### Check if shared trip was authorized (1)
data <- data %>% 
  mutate("shared_authorized" = if_else(shared_trip_authorized == "true", 1, 0))

# This function calls Chicago geometric from census package and picks only 
# tracts of Cook's county which are within the limits of city of Chicago
chicago_tracts <- function() {
  chicago <- get_acs(geography = "tract", 
                     variables = "B19013_001", 
                     state = "IL", 
                     county = c("Cook"), 
                     geometry = T)
  # MEDIAN HOUSEHOLD INCOME IN THE PAST 12 MONTHS (IN 2018 INFLATION-ADJUSTED DOLLARS)
  
  chicago_tracts <- read.csv("/Users/taiebat/Desktop/CensusTractsTIGER2010.csv")
  needed<-which(chicago$GEOID %in% chicago_tracts$GEOID10)    
  chicago <- chicago[needed,]
  return(chicago)
}
chicago <- chicago_tracts()

Section 1: Zonal level analysis

In this section, we only focus on analysis at either pick up or drop off tract level (not zone to zone or OD pairs). The following figure show aggregate-level summary statistics at pick up tracts and drop off tracts.

By Pick Up

pickups <- data %>% 
  group_by(GEOID = pickup_census_tract) %>% 
  summarise("sum_trip" = n(),                          # total number of trips
            "sum_trip_shared_ok" = sum(shared_authorized > 0), # number of trip authorized to be shared
            "sum_trip_shared" = sum(pooled[shared_authorized > 0] > 0)) %>%           # number of trips with successful shared match
  mutate("ratio_shared_ok" = sum_trip_shared_ok / sum_trip,
         "ratio_matched" = sum_trip_shared / sum_trip_shared_ok) %>% 
  mutate(GEOID = as.character(GEOID)) %>% 
    drop_na()

  
  pickups %>% 
    select(ratio_shared_ok, ratio_matched, GEOID) %>% 
    pivot_longer(c("ratio_shared_ok", "ratio_matched")) %>% 
  ggplot()+
    geom_violin(aes(x= name, y= value)) + 
    geom_boxplot(aes(x = name, y = value ), width = 0.02) +
    theme_bw() + labs(title = "By Pick Up Tract", x = "", y = "Ratio") +
    scale_x_discrete(labels=c("ratio_shared_ok" = "ratio of trips\nauthorized to\nbe shared",
                              "ratio_matched" = "ratio of shared\nauthorized trips\nsuccessfully matched"))

# Function to cut and create labels
source("~/Desktop/Chicago-TNC-analysis/quantiles_cut.R")
pickups <- quantiles_cut(input = pickups, variable = "sum_trip", n_cut = 10, n_round = 0)
## Warning: rename_() is deprecated. 
## Please use rename() instead
## 
## The 'programming' vignette or the tidyeval book can help you
## to program with rename() : https://tidyeval.tidyverse.org
## This warning is displayed once per session.
chicago %>%  left_join(pickups, by = "GEOID") %>% filter(!is.na(sum_trip_cut)) %>% 
  ggplot(aes(fill = sum_trip_cut)) + 
  geom_sf(color = NA) + 
  theme_bw() + 
  labs(fill = "" , 
       title = "Number of Total Trips",
       subtitle = "By Pick Up Census Tract")+
  scale_fill_viridis_d(direction = -1)  

pickups <- quantiles_cut(input = pickups, variable = "ratio_shared_ok", n_cut = 10)
chicago %>%  left_join(pickups, by = "GEOID") %>% filter(!is.na(ratio_shared_ok_cut)) %>% 
  ggplot(aes(fill = ratio_shared_ok_cut)) + 
  geom_sf(color = NA) + 
  theme_bw() + 
  labs(fill = "" , 
       title = "Ratio of Trips Authorized for Shared Ride",
       subtitle = "By Pick Up Census Tract")+
  scale_fill_viridis_d(direction = 1)

By Drop Off

dropoffs <- data %>% 
  group_by(GEOID = dropoff_census_tract) %>% 
  summarise("sum_trip" = n(),                          # total number of trips
            "sum_trip_shared_ok" = sum(shared_authorized > 0), # number of trip authorized to be shared
            "sum_trip_shared" = sum(pooled[shared_authorized > 0] > 0)) %>%           # number of trips with successful shared match
  mutate("ratio_shared_ok" = sum_trip_shared_ok / sum_trip,
         "ratio_matched" = sum_trip_shared / sum_trip_shared_ok) %>% 
  mutate(GEOID = as.character(GEOID)) %>% 
    drop_na()

  
  dropoffs %>% 
    select(ratio_shared_ok, ratio_matched, GEOID) %>% 
    pivot_longer(c("ratio_shared_ok", "ratio_matched")) %>% 
  ggplot()+
    geom_violin(aes(x= name, y= value)) + 
    geom_boxplot(aes(x = name, y = value ), width = 0.02) +
    theme_bw() + labs(title = "By Drop Off Tract", x = "", y = "Ratio") +
    scale_x_discrete(labels=c("ratio_shared_ok" = "ratio of trips\nauthorized to\nbe shared",
                              "ratio_matched" = "ratio of shared\nauthorized trips\nsuccessfully matched"))

# Function to cut and create labels
source("~/Desktop/Chicago-TNC-analysis/quantiles_cut.R")
dropoffs <- quantiles_cut(input = dropoffs, variable = "sum_trip", n_cut = 10, n_round = 0)

chicago %>%  left_join(dropoffs, by = "GEOID") %>% filter(!is.na(sum_trip_cut)) %>% 
  ggplot(aes(fill = sum_trip_cut)) + 
  geom_sf(color = NA) + 
  theme_bw() + 
  labs(fill = "" , 
       title = "Number of Total Trips",
       subtitle = "By Drop Off Census Tract")+
  scale_fill_viridis_d(direction = -1)  

dropoffs <- quantiles_cut(input = dropoffs, variable = "ratio_shared_ok", n_cut = 10)
chicago %>%  left_join(dropoffs, by = "GEOID") %>% filter(!is.na(ratio_shared_ok_cut)) %>% 
  ggplot(aes(fill = ratio_shared_ok_cut)) + 
  geom_sf(color = NA) + 
  theme_bw() + 
  labs(fill = "" , 
       title = "Ratio of Trips Authorized for Shared Ride",
       subtitle = "By Drop Off Census Tract")+
  scale_fill_viridis_d(direction = 1)

Section 2: Zone-to-zone level analysis

In this section, we focus on OD pair analysis. This means, we aggregate trips between each two census tract. We observe 136074 OD pairs. Following figures show descriptive stats of zone-to-zone levels (OD pairs). In other words, we group the data to pairs of origin-destination where the new dataframe has 136074 rows.

ODPAIRS <- data %>% 
  group_by(pickup_census_tract, dropoff_census_tract) %>% 
 summarise("sum_trip" = n(),                          # total number of trips
            "sum_trip_shared_ok" = sum(shared_authorized > 0), # number of trip authorized to be shared
            "sum_trip_shared" = sum(pooled[shared_authorized > 0] > 0)) %>%           # number of trips with successful shared match
  mutate("ratio_shared_ok" = sum_trip_shared_ok / sum_trip,
         "ratio_matched" = sum_trip_shared / sum_trip_shared_ok) %>% 
    drop_na()

  ODPAIRS %>% 
    select(ratio_shared_ok, ratio_matched, pickup_census_tract, dropoff_census_tract) %>% 
    drop_na() %>% 
    pivot_longer(c("ratio_shared_ok", "ratio_matched")) %>% 
  ggplot()+
    geom_violin(aes(x= name, y= value)) + 
    geom_boxplot(aes(x = name, y = value ), width = 0.02) +
    theme_bw() + labs(title = "OD Pairs", x = "", y = "Ratio") +
    scale_x_discrete(labels=c("ratio_shared_ok" = "ratio of trips\nauthorized to\nbe shared",
                              "ratio_matched" = "ratio of shared\nauthorized trips\nsuccessfully matched"))

# summary statistics of number of trips between each two tracts
summary(ODPAIRS$sum_trip)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     1.00     2.00     8.00    56.73    27.00 16936.00
# summary statistics of ratio of trips between each two tracts authorized to be shared
summary(ODPAIRS$ratio_shared_ok)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.005952 0.133333 0.285714 0.402392 0.500000 1.000000
# summary statistics of ratio of MATCHED trips between each two tracts which were authorized to be shared (success rate)
summary(ODPAIRS$ratio_matched)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.5000  0.8654  0.6922  1.0000  1.0000

An Interesting Map

The following maps needs some explanation to be clear. In the previous section, after finding the OD pairs, I found the share of trips that were authorized to be shared (ratio of willingness to pool (WTP) between two tracts).

Here, I average ratio of WTP over the pick up tract. It means that I group OD pairs by the pick up tract and then average the ratio of WTP. The message that the figure carries is not substantially different from other charts: WTP is higher in south and west neighborhoods.

ODPAIRS_average_shared_ratio <- ODPAIRS %>% 
  group_by(pickup_census_tract) %>% 
  summarise("average_ratio_shared_ok_pickup" = mean(ratio_shared_ok)) %>% 
  mutate(pickup_census_tract = as.character(pickup_census_tract))  


chicago %>%  left_join(ODPAIRS_average_shared_ratio, by = c("GEOID" = "pickup_census_tract")) %>% filter(!is.na(average_ratio_shared_ok_pickup)) %>% 
  ggplot(aes(fill = average_ratio_shared_ok_pickup)) + 
  geom_sf(color = NA) + 
  theme_bw() + 
  labs(fill = "" , 
       title = "Average Ratio of Shared Authorized Trip in OD Pairs",
       subtitle = "(Averaged Over Pick Up Census Tract)")+
  scale_fill_viridis(direction = -1, limits = c(0,1))  

New Figures as of 5/25/2020

Daily Pick-ups in Central and out of Central

pickups <- data %>% 
  mutate("start_central" = if_else(start_neighborhood == "Central", true = 1, 0),
         "start_no_central" = if_else(start_neighborhood != "Central", true = 1, 0)) %>% 
  group_by(start_date) %>% 
  summarise("central_trips" = sum(start_central, na.rm = T),  # daily number of trips in central   
            "no_central_trips" = sum(start_no_central, na.rm = T),  # daily number of trips NOT in central  
            "pct_sharedOK_central_trips" = (sum(start_central[shared_authorized > 0], na.rm = T) / sum(start_central, na.rm = T)),
            # daily percent of total rides that are (authorized to be) shared in central
            "pct_sharedOK_no_central_trips" = (sum(start_no_central[shared_authorized > 0], na.rm = T) / sum(start_no_central, na.rm = T))  
            # daily percent of total rides that are (authorized to be) shared OUT of central
            ) %>% 
   mutate(start_date = as.Date(start_date))


ggplot(pickups, aes(x = start_date))+
  geom_line(aes(y = central_trips, color = "Number of Trips Started in Central")) +
  geom_line(aes(y = no_central_trips, color = "Number of Trips Started OUT of Central")) +
  scale_colour_manual(values = c("blue", "red")) + 
  scale_y_continuous(labels = comma)+ theme_bw()+
  theme(legend.position="bottom", 
        legend.title = element_blank(), legend.text=element_text(size=8)) +
  scale_x_date(breaks=date_breaks("week")) +
  labs(title =  "Daily Total Number of Trips",
       subtitle = "started in central and out of central",
       y = "", x = "")

ggplot(pickups, aes(x = start_date))+
  geom_line(aes(y = 100 * pct_sharedOK_central_trips, color = "Percentage of Trips Started in Central\nWhich Authorized Shared Ride")) +
  geom_line(aes(y = 100* pct_sharedOK_no_central_trips, color = "Percentage of Trips Started OUT of Central\nWhich Authorized Shared Ride")) +
  scale_colour_manual(values = c("blue", "red")) + 
  scale_y_continuous(labels = comma)+ theme_bw()+
  theme(legend.position="bottom", 
        legend.title = element_blank(), legend.text=element_text(size=8)) +
  scale_x_date(breaks=date_breaks("week")) +
  labs(title =  "Daily Percentage of Trips Authorized Shared Ride",
       subtitle = "started in central and out of central",
       y = "%", x = "")

pickups1 <- data %>% 
  mutate("start_central_6to22" = if_else(start_neighborhood == "Central" & 
                                           between(start_hour, 6, 21), true = 1, 0),
         ) %>% 
  group_by(start_date) %>% 
  summarise("pct_surcharge_central_trips" = (sum(start_central_6to22[shared_authorized < 1], na.rm = T) / sum(start_central_6to22, na.rm = T))  
            # percentage of trips in central between 6 AM to 10 PM which did not authorize to be shared 
            ) %>% 
   mutate(start_date = as.Date(start_date))

g.subtitle <- expression(paste("Which ", bold("Unauthorized")," Shared Ride"))

ggplot(pickups1, aes(x = start_date))+
  geom_line(aes(y = 100 * pct_surcharge_central_trips)) + theme_bw()+
  theme(legend.position="bottom", 
        legend.title = element_blank(), legend.text=element_text(size=8)) +
  scale_x_date(breaks=date_breaks("week")) +
  labs(title =  "Daily Percentage of Trips in Central Started Between 6 AM & 10", 
       subtitle = g.subtitle, y = "%", x = "",
       caption = "This trips will be assessed a $3 tax under new policy")

New Figures as of 5/26/2020

Hourly Pick-ups in Central and out of Central

pickups <- data %>% 
  mutate("start_central" = if_else(start_neighborhood == "Central", true = 1, 0),
         "start_no_central" = if_else(start_neighborhood != "Central", true = 1, 0)) %>% 
  group_by(hour=floor_date(start, "hour")) %>% 
  summarise("central_trips" = sum(start_central, na.rm = T),  # daily number of trips in central   
            "no_central_trips" = sum(start_no_central, na.rm = T),  # daily number of trips NOT in central  
            "pct_sharedOK_central_trips" = (sum(start_central[shared_authorized > 0], na.rm = T) / sum(start_central, na.rm = T)),
            # daily percent of total rides that are (authorized to be) shared in central
            "pct_sharedOK_no_central_trips" = (sum(start_no_central[shared_authorized > 0], na.rm = T) / sum(start_no_central, na.rm = T))  
            # daily percent of total rides that are (authorized to be) shared OUT of central
            )

ggplot(pickups, aes(x = hour))+
  geom_line(aes(y = central_trips, color = "Number of Trips Started in Central")) +
  geom_line(aes(y = no_central_trips, color = "Number of Trips Started OUT of Central")) +
  scale_colour_manual(values = c("blue", "red")) + 
  scale_y_continuous(labels = comma)+ theme_bw()+
  theme(legend.position="bottom", 
        legend.title = element_blank(), legend.text=element_text(size=10)) +
  scale_x_datetime(breaks=date_breaks("12 hour"),  date_labels = "%m/%d %H:%M") +
  labs(title =  "Hourly Total Number of Trips",
       subtitle = "started in central and out of central",
       y = "", x = "") + theme(axis.text.x = element_text(angle = 90, hjust = 1))

ggplot(pickups, aes(x = hour))+
  geom_line(aes(y = 100 * pct_sharedOK_central_trips, color = "Percentage of Trips Started in Central\nWhich Authorized Shared Ride")) +
  geom_line(aes(y = 100* pct_sharedOK_no_central_trips, color = "Percentage of Trips Started OUT of Central\nWhich Authorized Shared Ride")) +
  scale_colour_manual(values = c("blue", "red")) + 
  scale_y_continuous(labels = comma)+ theme_bw()+
  theme(legend.position="bottom", 
        legend.title = element_blank(), legend.text=element_text(size=10)) +
  scale_x_datetime(breaks=date_breaks("12 hour"),  date_labels = "%m/%d %H:%M") +
  labs(title =  "Hourly Percentage of Trips Authorized Shared Ride",
       subtitle = "started in central and out of central",
       y = "%", x = "") + theme(axis.text.x = element_text(angle = 90, hjust = 1))

Demean Percentage Shared Trend

pickups <- data %>% 
  mutate("start_central" = if_else(start_neighborhood == "Central", true = 1, 0),
         "start_no_central" = if_else(start_neighborhood != "Central", true = 1, 0)) %>% 
  group_by(start_date) %>% 
  summarise("central_trips" = sum(start_central, na.rm = T),  # daily number of trips in central   
            "no_central_trips" = sum(start_no_central, na.rm = T),  # daily number of trips NOT in central  
            "pct_sharedOK_central_trips" = (sum(start_central[shared_authorized > 0], na.rm = T) / sum(start_central, na.rm = T)),
            # daily percent of total rides that are (authorized to be) shared in central
            "pct_sharedOK_no_central_trips" = (sum(start_no_central[shared_authorized > 0], na.rm = T) / sum(start_no_central, na.rm = T))  
            # daily percent of total rides that are (authorized to be) shared OUT of central
            ) %>% 
   mutate(start_date = as.Date(start_date))


pickups_long <- pickups %>% 
  select(start_date, pct_sharedOK_central_trips, pct_sharedOK_no_central_trips) %>% 
  pivot_longer(-start_date, values_to = "pct_sharedOK", names_to = "central")

head(pickups_long)
## # A tibble: 6 x 3
##   start_date central                       pct_sharedOK
##   <date>     <chr>                                <dbl>
## 1 2019-11-01 pct_sharedOK_central_trips          0.0820
## 2 2019-11-01 pct_sharedOK_no_central_trips       0.159 
## 3 2019-11-02 pct_sharedOK_central_trips          0.0670
## 4 2019-11-02 pct_sharedOK_no_central_trips       0.142 
## 5 2019-11-03 pct_sharedOK_central_trips          0.0837
## 6 2019-11-03 pct_sharedOK_no_central_trips       0.161
fit <- lm(pct_sharedOK ~ factor(start_date) + factor(central), data = pickups_long)
fit_reduced <- lm(pct_sharedOK ~ factor(start_date) , data = pickups_long)
par(mfrow = c(2,2)); plot(fit)

fitted <- predict(fit, newdata = pickups_long)
pickups_long <- data.frame(pickups_long, fitted, fit$residuals)
ggplot(pickups_long) +
  geom_point(aes(x= fitted, y = `fit.residuals`, color = central)) + 
   theme_bw()+ theme(legend.position="bottom") +
  labs(y = "Residuals", x = "Fitted Values") +
  scale_color_discrete(labels = c("Started in Central", "Started OUT of Central"))

ggplot(pickups_long) +
  geom_point(aes(x= start_date, y = `fit.residuals`, color = central)) + 
   theme_bw()+ theme(legend.position="bottom") +
  labs(y = "Residuals", x = "Date") +
  scale_color_discrete(labels = c("Started in Central", "Started OUT of Central"))

fitted <- predict(fit_reduced, newdata = pickups_long)
pickups_long <- data.frame(pickups_long, fitted, fit_reduced$residuals)
ggplot(pickups_long) +
  geom_point(aes(x= fitted.1, y = `fit_reduced.residuals`, color = central)) + 
   theme_bw()+ theme(legend.position="bottom") +
  labs(y = "Residuals", x = "Fitted Values") +
  scale_color_discrete(labels = c("Started in Central", "Started OUT of Central"))

ggplot(pickups_long) +
  geom_point(aes(x= start_date, y = `fit_reduced.residuals`, color = central)) + 
   theme_bw()+ theme(legend.position="bottom") +
  labs(y = "Residuals", x = "Date") +
  scale_color_discrete(labels = c("Started in Central", "Started OUT of Central"))